{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Question 8\n",
    "导入 MNIST 数据（第三章中介绍），把它切分进一个训练集，一个验证集，和一个测试\n",
    "集（例如 40000 个实例进行训练，10000 个进行验证，10000 个进行测试）。然后训练\n",
    "多个分类器，例如一个随机森林分类器，一个 Extra-Tree 分类器和一个 SVM。接下来，\n",
    "尝试将它们组合成集成，使用软或硬投票分类器来胜过验证集上的所有集合。一旦找到\n",
    "了，就在测试集上实验。与单个分类器相比，它的性能有多好？"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function fetch_mldata is deprecated; fetch_mldata was deprecated in version 0.20 and will be removed in version 0.22. Please use fetch_openml.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n",
      "D:\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function mldata_filename is deprecated; mldata_filename was deprecated in version 0.20 and will be removed in version 0.22. Please use fetch_openml.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'DESCR': 'mldata.org dataset: mnist-original',\n",
       " 'COL_NAMES': ['label', 'data'],\n",
       " 'target': array([0., 0., 0., ..., 9., 9., 9.]),\n",
       " 'data': array([[0, 0, 0, ..., 0, 0, 0],\n",
       "        [0, 0, 0, ..., 0, 0, 0],\n",
       "        [0, 0, 0, ..., 0, 0, 0],\n",
       "        ...,\n",
       "        [0, 0, 0, ..., 0, 0, 0],\n",
       "        [0, 0, 0, ..., 0, 0, 0],\n",
       "        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.datasets import fetch_mldata\n",
    "mnist = fetch_mldata('MNIST original',data_home='../datasets/minst')\n",
    "mnist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将数据划分为训练数据 和测试数据\n",
    "X, y = mnist[\"data\"], mnist[\"target\"]\n",
    "X_train, X_check, X_test, y_train, y_check, y_test = X[:50000], X[50000:60000], X[60000:], y[:50000], y[50000:60000], y[60000:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### PCA压缩"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "pca = PCA(n_components=154)\n",
    "X_mnist_reduced=pca.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VotingClassifier(estimators=[('rf',\n",
       "                              RandomForestClassifier(bootstrap=True,\n",
       "                                                     class_weight=None,\n",
       "                                                     criterion='gini',\n",
       "                                                     max_depth=None,\n",
       "                                                     max_features='auto',\n",
       "                                                     max_leaf_nodes=None,\n",
       "                                                     min_impurity_decrease=0.0,\n",
       "                                                     min_impurity_split=None,\n",
       "                                                     min_samples_leaf=1,\n",
       "                                                     min_samples_split=2,\n",
       "                                                     min_weight_fraction_leaf=0.0,\n",
       "                                                     n_estimators='warn',\n",
       "                                                     n_jobs=-1, oob_score=False,\n",
       "                                                     random_state=None,\n",
       "                                                     verbo...\n",
       "                                                   n_jobs=-1, oob_score=False,\n",
       "                                                   random_state=None, verbose=0,\n",
       "                                                   warm_start=False)),\n",
       "                             ('svc',\n",
       "                              SVC(C=1.0, cache_size=200, class_weight=None,\n",
       "                                  coef0=0.0, decision_function_shape='ovr',\n",
       "                                  degree=3, gamma='auto_deprecated',\n",
       "                                  kernel='rbf', max_iter=-1, probability=False,\n",
       "                                  random_state=None, shrinking=True, tol=0.001,\n",
       "                                  verbose=False))],\n",
       "                 flatten_transform=True, n_jobs=-1, voting='hard',\n",
       "                 weights=None)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "rnd_clf = RandomForestClassifier(n_jobs=-1)\n",
    "ett_clf = ExtraTreesClassifier(n_jobs=-1)\n",
    "svc_clf = SVC()\n",
    "\n",
    "voting_clf = VotingClassifier(estimators=[('rf', rnd_clf), ('et', ett_clf),('svc', svc_clf)],\n",
    "                              voting='hard', n_jobs=-1)\n",
    "voting_clf.fit(X_mnist_reduced, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "Number of features of the model must match the input. Model n_features is 154 and input n_features is 784 ",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-5-b15c0c35b963>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mclf\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mrnd_clf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mett_clf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msvc_clf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvoting_clf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m     \u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_mnist_reduced\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m     \u001b[0my_pred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m     \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py\u001b[0m in \u001b[0;36mpredict\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    543\u001b[0m             \u001b[0mThe\u001b[0m \u001b[0mpredicted\u001b[0m \u001b[0mclasses\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    544\u001b[0m         \"\"\"\n\u001b[1;32m--> 545\u001b[1;33m         \u001b[0mproba\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict_proba\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    546\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    547\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mn_outputs_\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    586\u001b[0m         \u001b[0mcheck_is_fitted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'estimators_'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    587\u001b[0m         \u001b[1;31m# Check data\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 588\u001b[1;33m         \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    589\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    590\u001b[0m         \u001b[1;31m# Assign chunk of trees to jobs\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py\u001b[0m in \u001b[0;36m_validate_X_predict\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    357\u001b[0m                                  \"call `fit` before exploiting the model.\")\n\u001b[0;32m    358\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 359\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mestimators_\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_input\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    360\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    361\u001b[0m     \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\tree\\tree.py\u001b[0m in \u001b[0;36m_validate_X_predict\u001b[1;34m(self, X, check_input)\u001b[0m\n\u001b[0;32m    400\u001b[0m                              \u001b[1;34m\"match the input. Model n_features is %s and \"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    401\u001b[0m                              \u001b[1;34m\"input n_features is %s \"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 402\u001b[1;33m                              % (self.n_features_, n_features))\n\u001b[0m\u001b[0;32m    403\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    404\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: Number of features of the model must match the input. Model n_features is 154 and input n_features is 784 "
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "for clf in (rnd_clf, ett_clf, svc_clf, voting_clf):\n",
    "    clf.fit(X_mnist_reduced, y_train)\n",
    "    y_pred = clf.predict(X_test)\n",
    "    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
